In [2]:
import pandas as pd
import numpy as np
In [3]:
# Load the data
file = 'C:/Users/Lajpat Rai/Desktop/Ryseron Data Science Diploma/CIND820 Project/data.csv'
df = pd.read_csv(file)
df.head() # let' see first 5 rows
Out[3]:
AT AP AH AFDP GTEP TIT TAT TEY CDP CO NOX
0 4.5878 1018.7 83.675 3.5758 23.979 1086.2 549.83 134.67 11.898 0.32663 81.952
1 4.2932 1018.3 84.235 3.5709 23.951 1086.1 550.05 134.67 11.892 0.44784 82.377
2 3.9045 1018.4 84.858 3.5828 23.990 1086.5 550.19 135.10 12.042 0.45144 83.776
3 3.7436 1018.3 85.434 3.5808 23.911 1086.5 550.17 135.03 11.990 0.23107 82.505
4 3.7516 1017.8 85.182 3.5781 23.917 1085.9 550.00 134.67 11.910 0.26747 82.028

1. Data Cleaning¶

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36733 entries, 0 to 36732
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      36733 non-null  float64
 1   AP      36733 non-null  float64
 2   AH      36733 non-null  float64
 3   AFDP    36733 non-null  float64
 4   GTEP    36733 non-null  float64
 5   TIT     36733 non-null  float64
 6   TAT     36733 non-null  float64
 7   TEY     36733 non-null  float64
 8   CDP     36733 non-null  float64
 9   CO      36733 non-null  float64
 10  NOX     36733 non-null  float64
dtypes: float64(11)
memory usage: 3.1 MB
In [4]:
# identify missing data
missing_data = df.isnull()
# Using a for loop in Python, the method ".value_counts()" counts the number of "True" values.
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")
AT
False    36733
Name: AT, dtype: int64

AP
False    36733
Name: AP, dtype: int64

AH
False    36733
Name: AH, dtype: int64

AFDP
False    36733
Name: AFDP, dtype: int64

GTEP
False    36733
Name: GTEP, dtype: int64

TIT
False    36733
Name: TIT, dtype: int64

TAT
False    36733
Name: TAT, dtype: int64

TEY
False    36733
Name: TEY, dtype: int64

CDP
False    36733
Name: CDP, dtype: int64

CO
False    36733
Name: CO, dtype: int64

NOX
False    36733
Name: NOX, dtype: int64

In [5]:
# print the summary
print(df.describe())
                 AT            AP            AH          AFDP          GTEP  \
count  36733.000000  36733.000000  36733.000000  36733.000000  36733.000000   
mean      17.712726   1013.070165     77.867015      3.925518     25.563801   
std        7.447451      6.463346     14.461355      0.773936      4.195957   
min       -6.234800    985.850000     24.085000      2.087400     17.698000   
25%       11.781000   1008.800000     68.188000      3.355600     23.129000   
50%       17.801000   1012.600000     80.470000      3.937700     25.104000   
75%       23.665000   1017.000000     89.376000      4.376900     29.061000   
max       37.103000   1036.600000    100.200000      7.610600     40.716000   

                TIT           TAT           TEY           CDP            CO  \
count  36733.000000  36733.000000  36733.000000  36733.000000  36733.000000   
mean    1081.428084    546.158517    133.506404     12.060525      2.372468   
std       17.536373      6.842360     15.618634      1.088795      2.262672   
min     1000.800000    511.040000    100.020000      9.851800      0.000388   
25%     1071.800000    544.720000    124.450000     11.435000      1.182400   
50%     1085.900000    549.880000    133.730000     11.965000      1.713500   
75%     1097.000000    550.040000    144.080000     12.855000      2.842900   
max     1100.900000    550.610000    179.500000     15.159000     44.103000   

                NOX  
count  36733.000000  
mean      65.293067  
std       11.678357  
min       25.905000  
25%       57.162000  
50%       63.849000  
75%       71.548000  
max      119.910000  
In [4]:
# Pairgrid plot between variables
import seaborn
import matplotlib.pyplot as plt
graph = seaborn.PairGrid(df[['AT','AP','AH','AFDP','GTEP','TIT','TAT','TEY','CDP','CO','NOX']],corner = True)
graph = graph.map_diag(plt.hist) 
# type of graph for non-diagonal 
graph = graph.map_offdiag(plt.scatter) 
# to add legends 
graph = graph.add_legend() 
# to show 
plt.show()
C:\Users\Lajpat Rai\anaconda3\anaconda\lib\site-packages\seaborn\axisgrid.py:156: UserWarning: The handle <matplotlib.patches.Patch object at 0x000001C9F036B220> has a label of '_nolegend_' which cannot be automatically added to the legend.
  figlegend = self._figure.legend(handles, labels, **kwargs)

2. Exploratory Data Analysis¶

In [5]:
# create histogram for each variabel to check normality.
import matplotlib.pyplot as plt
for i in df.columns:
    plt.hist(df[i], color = 'g')
    plt.title(i)
    plt.show()
  

Dataset is clean and has no missing values. Few parameters are normal except TIT, TEY, TAT, GTEP, CO & AH. Furthemore, Standard devitaion is small for all parameters hence most of data is close to mean. Data will be normalized before splitting it into training & testing data.

In [6]:
# Outliers detection
red_circle = dict(markerfacecolor = 'red', marker = 's', markeredgecolor= 'white')
fig, axs = plt.subplots(1,len(df.columns), figsize = (30,10))
for i, ax in enumerate(axs.flat):
    ax.boxplot(df.iloc[:,i], flierprops = red_circle)
    ax.set_title(df.columns[i], fontsize = 20, fontweight = 'bold')
    ax.tick_params(axis = 'y', labelsize = 20)
    
plt.tight_layout()
In [7]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left")

fig.add_trace(go.Box(x=df['AT']),row=1, col=1)
fig.add_trace(go.Box(x=df['TAT']),row=2, col=1)
fig.update_layout(height=700, showlegend=False)
fig.show()

The boxplot shows that more input variables are outliers, so I'll use mean absolute error (MAE) to evaluate the model in modeling. The MAE is not sensitive to the outliers

3. Feature selection Filter Based Tecnhiques¶

3.1 Feature Selection with the help of correlation¶

In [9]:
# let's visualize the correlation thru heatmap
!pip install pandas --upgrade
import matplotlib.pyplot as plt
import seaborn as sns
corr_matrix = df.corr(method = 'pearson')
plt.figure(figsize=(11,9))
dropSelf = np.zeros_like(corr_matrix)
dropSelf[np.triu_indices_from(dropSelf)] = True

sns.heatmap(corr_matrix, cmap=sns.diverging_palette(220, 10, as_cmap=True), annot=True, fmt=".2f", mask=dropSelf)

sns.set(font_scale=1.5)
Requirement already satisfied: pandas in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (1.4.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (2021.3)
Requirement already satisfied: numpy>=1.18.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (1.21.5)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)

The heatmap above shows the correlation between features and output variables(CO and NOx). It is easy to see that some features are negatively correlated each other. For example, the correlation between TIT and CO is -0.71, and it means that when the Turbine Inlet Temperature (TIT) decreases, the gas-turbine engine will produce more CO because a low TIT reduces the efficiency of the gas-turbine engine(look at the figure below). image.png

In [8]:
X = df.drop(columns = ['CO', 'NOX'])
Y_CO = df[['CO']]
Y_NOX = df[['NOX']]
print(X)
print(Y_CO)
           AT      AP      AH    AFDP    GTEP     TIT     TAT     TEY     CDP
0      4.5878  1018.7  83.675  3.5758  23.979  1086.2  549.83  134.67  11.898
1      4.2932  1018.3  84.235  3.5709  23.951  1086.1  550.05  134.67  11.892
2      3.9045  1018.4  84.858  3.5828  23.990  1086.5  550.19  135.10  12.042
3      3.7436  1018.3  85.434  3.5808  23.911  1086.5  550.17  135.03  11.990
4      3.7516  1017.8  85.182  3.5781  23.917  1085.9  550.00  134.67  11.910
...       ...     ...     ...     ...     ...     ...     ...     ...     ...
36728  3.6268  1028.5  93.200  3.1661  19.087  1037.0  541.59  109.08  10.411
36729  4.1674  1028.6  94.036  3.1923  19.016  1037.6  542.28  108.79  10.344
36730  5.4820  1028.5  95.219  3.3128  18.857  1038.0  543.48  107.81  10.462
36731  5.8837  1028.7  94.200  3.9831  23.563  1076.9  550.11  131.41  11.771
36732  6.0392  1028.8  94.547  3.8752  22.524  1067.9  548.23  125.41  11.462

[36733 rows x 9 columns]
             CO
0       0.32663
1       0.44784
2       0.45144
3       0.23107
4       0.26747
...         ...
36728  10.99300
36729  11.14400
36730  11.41400
36731   3.31340
36732  11.98100

[36733 rows x 1 columns]
In [9]:
#Find the information gain or mutual information of the independent variable with respect to a target variable NOX
import warnings
warnings.simplefilter("ignore")
from sklearn.feature_selection import mutual_info_regression
mi_NOX = mutual_info_regression(X, Y_NOX)
mi_NOX = pd.Series(mi_NOX)
mi_NOX.index = X.columns
mi_NOX.sort_values(ascending=False)
mi_NOX.sort_values(ascending=False).plot.bar(figsize=(10, 4))
Out[9]:
<AxesSubplot:>
In [10]:
#Find the information gain or mutual information of the independent variable with respect to a target variable CO
import warnings
warnings.simplefilter("ignore")
from sklearn.feature_selection import mutual_info_regression
mi_CO = mutual_info_regression(X, Y_CO)
mi_CO = pd.Series(mi_CO)
mi_CO.index = X.columns
mi_CO.sort_values(ascending=False)
mi_CO.sort_values(ascending=False).plot.bar(figsize=(10, 4))
Out[10]:
<AxesSubplot:>
In [11]:
from sklearn.feature_selection import SelectKBest
sel_five_cols = SelectKBest(mutual_info_regression, k=5)
sel_five_cols.fit(X, Y_CO)
X.columns[sel_five_cols.get_support()]
Out[11]:
Index(['AFDP', 'GTEP', 'TIT', 'TEY', 'CDP'], dtype='object')
In [12]:
from sklearn.feature_selection import SelectKBest
sel_five_cols = SelectKBest(mutual_info_regression, k=5)
sel_five_cols.fit(X, Y_NOX)
X.columns[sel_five_cols.get_support()]
Out[12]:
Index(['AT', 'GTEP', 'TIT', 'TEY', 'CDP'], dtype='object')

4 Wrapper technique for feature selection¶

In [13]:
!pip install mlxtend
Requirement already satisfied: mlxtend in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (0.20.0)
Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.0.2)
Requirement already satisfied: numpy>=1.16.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.21.5)
Requirement already satisfied: joblib>=0.13.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.1.0)
Requirement already satisfied: setuptools in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (61.2.0)
Requirement already satisfied: matplotlib>=3.0.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (3.5.1)
Requirement already satisfied: pandas>=0.24.2 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.4.3)
Requirement already satisfied: scipy>=1.2.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from mlxtend) (1.7.3)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.4)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (4.25.0)
Requirement already satisfied: cycler>=0.10 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (9.0.1)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (1.3.2)
Requirement already satisfied: packaging>=20.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (21.3)
Requirement already satisfied: pytz>=2020.1 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from pandas>=0.24.2->mlxtend) (2021.3)
Requirement already satisfied: six>=1.5 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\lajpat rai\anaconda3\anaconda\lib\site-packages (from scikit-learn>=1.0.2->mlxtend) (2.2.0)

4.1 Forward Selection — SFS() from mlxtend¶

In [15]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
           k_features=5,
           forward=True,
           floating=False,
           scoring = 'r2',
           cv = 0)

X_train, X_test, y_train_CO, y_test_CO = train_test_split(X, Y_CO, test_size = 0.30)
#Use SFS to select the top 5 features 
sfs.fit(X_train, y_train_CO)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[15]:
feature_idx cv_scores avg_score feature_names
1 (5,) [0.49154555980999715] 0.491546 (TIT,)
2 (5, 6) [0.5426697239666705] 0.54267 (TIT, TAT)
3 (2, 5, 6) [0.5472493402224305] 0.547249 (AH, TIT, TAT)
4 (2, 5, 6, 7) [0.5483558819190959] 0.548356 (AH, TIT, TAT, TEY)
5 (2, 5, 6, 7, 8) [0.5499379260261003] 0.549938 (AH, TIT, TAT, TEY, CDP)
In [19]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
sfs = SFS(LinearRegression(), 
          k_features=9, 
          forward=True, 
          floating=False, 
          scoring='r2',
          cv=5)

sfs = sfs.fit(X_train, y_train_CO)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()
In [20]:
# SFS for depedent variable NOX
X_train, X_test, y_train_NOX, y_test_NOX = train_test_split(X, Y_NOX, test_size = 0.30)
#Use SFS to select the top 5 features 
sfs.fit(X_train, y_train_NOX)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[20]:
feature_idx cv_scores avg_score feature_names
1 (0,) [0.3085781398449149, 0.30774129376079484, 0.31... 0.310286 (AT,)
2 (0, 4) [0.33757096687736354, 0.33999106256866596, 0.3... 0.341815 (AT, GTEP)
3 (0, 2, 4) [0.3674012766765754, 0.3711325441491694, 0.361... 0.368251 (AT, AH, GTEP)
4 (0, 2, 3, 4) [0.38397353356680997, 0.39322499554317447, 0.3... 0.384841 (AT, AH, AFDP, GTEP)
5 (0, 2, 3, 4, 6) [0.389435681282967, 0.3981896870544349, 0.3824... 0.391489 (AT, AH, AFDP, GTEP, TAT)
6 (0, 2, 3, 4, 5, 6) [0.4211516298498681, 0.4278002258922331, 0.411... 0.424148 (AT, AH, AFDP, GTEP, TIT, TAT)
7 (0, 2, 3, 4, 5, 6, 7) [0.5038012493696647, 0.4970458112953008, 0.493... 0.499962 (AT, AH, AFDP, GTEP, TIT, TAT, TEY)
8 (0, 1, 2, 3, 4, 5, 6, 7) [0.5157340389697513, 0.515294191608797, 0.5071... 0.514645 (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY)
9 (0, 1, 2, 3, 4, 5, 6, 7, 8) [0.5159032451514862, 0.5154487744407628, 0.507... 0.514794 (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP)
In [21]:
sfs = SFS(LinearRegression(), 
          k_features=9, 
          forward=True, 
          floating=False, 
          scoring='r2',
          cv=5)

sfs = sfs.fit(X_train, y_train_NOX)
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

4.2 Backward elimination — RFE() from Sklearn¶

In [23]:
from sklearn.feature_selection import RFE
#Build a logistic regression model 
#model = LinearRegression()
#Define RFE 
rfe = RFE(estimator=LinearRegression(), n_features_to_select=5)
#Use RFE to select the top 5 features 
fit = rfe.fit(X_train, y_train_CO)

features = X.columns.to_list()
df_RFE_results = []
for i in range(X.shape[1]):
    df_RFE_results.append(
        {      
            'Feature_names': features[i],
            'Selected':  rfe.support_[i],
            'RFE_ranking':  rfe.ranking_[i],
        }
    )

df_RFE_results = pd.DataFrame(df_RFE_results)
df_RFE_results.index.name='Columns'
df_RFE_results
Out[23]:
Feature_names Selected RFE_ranking
Columns
0 AT False 4
1 AP True 1
2 AH False 2
3 AFDP True 1
4 GTEP True 1
5 TIT False 5
6 TAT False 3
7 TEY True 1
8 CDP True 1
In [24]:
#Use RFE to select the top 5 features for target variable NOX
fit = rfe.fit(X_train, y_train_NOX)

features = X.columns.to_list()
df_RFE_results = []
for i in range(X.shape[1]):
    df_RFE_results.append(
        {      
            'Feature_names': features[i],
            'Selected':  rfe.support_[i],
            'RFE_ranking':  rfe.ranking_[i],
        }
    )

df_RFE_results = pd.DataFrame(df_RFE_results)
df_RFE_results.index.name='Columns'
df_RFE_results
Out[24]:
Feature_names Selected RFE_ranking
Columns
0 AT True 1
1 AP False 3
2 AH False 4
3 AFDP False 2
4 GTEP False 5
5 TIT True 1
6 TAT True 1
7 TEY True 1
8 CDP True 1

# backward elimintaion using SFS¶

In [27]:
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
           k_features=5,
           forward=False,
           floating=False,
           scoring = 'r2',
           cv = 0)
#Use SFS to select the top 5 features 
sfs.fit(X_train, y_train_CO)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[27]:
feature_idx cv_scores avg_score feature_names
9 (0, 1, 2, 3, 4, 5, 6, 7, 8) [0.00034212702315228327] 0.000342 (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP)
8 (0, 1, 2, 3, 4, 6, 7, 8) [0.00034146715969340935] 0.000341 (AT, AP, AH, AFDP, GTEP, TAT, TEY, CDP)
7 (1, 2, 3, 4, 6, 7, 8) [0.00033952330224518157] 0.00034 (AP, AH, AFDP, GTEP, TAT, TEY, CDP)
6 (1, 2, 3, 4, 7, 8) [0.00033883378229537797] 0.000339 (AP, AH, AFDP, GTEP, TEY, CDP)
5 (1, 2, 4, 7, 8) [0.0003276362169429836] 0.000328 (AP, AH, GTEP, TEY, CDP)
In [26]:
sfs = SFS(LinearRegression(),
           k_features=5,
           forward=False,
           floating=False,
           scoring = 'r2',
           cv = 0)
#Use SFS to select the top 5 features 
sfs.fit(X_train, y_train_NOX)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[26]:
feature_idx cv_scores avg_score feature_names
9 (0, 1, 2, 3, 4, 5, 6, 7, 8) [0.5155441018083071] 0.515544 (AT, AP, AH, AFDP, GTEP, TIT, TAT, TEY, CDP)
8 (0, 1, 2, 3, 5, 6, 7, 8) [0.5155229974796036] 0.515523 (AT, AP, AH, AFDP, TIT, TAT, TEY, CDP)
7 (0, 1, 2, 3, 5, 6, 7) [0.5153216712663955] 0.515322 (AT, AP, AH, AFDP, TIT, TAT, TEY)
6 (0, 1, 2, 5, 6, 7) [0.5146116580880893] 0.514612 (AT, AP, AH, TIT, TAT, TEY)
5 (0, 2, 5, 6, 7) [0.5001254614255183] 0.500125 (AT, AH, TIT, TAT, TEY)

4.3 Step-wise Selection — SFFS() from mlxtend¶

In [34]:
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
           k_features=5,
           forward=True,
           floating=True,
           scoring = 'r2',
           cv = 0)
#Use SFS to select the top 5 features 
sfs.fit(X, Y_CO)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[34]:
feature_idx cv_scores avg_score feature_names
1 (5,) [0.49882444817014526] 0.498824 (TIT,)
2 (5, 6) [0.5507217644680591] 0.550722 (TIT, TAT)
3 (2, 5, 6) [0.5552174969842719] 0.555217 (AH, TIT, TAT)
4 (2, 5, 6, 7) [0.55644618886337] 0.556446 (AH, TIT, TAT, TEY)
5 (2, 5, 6, 7, 8) [0.5581241617573389] 0.558124 (AH, TIT, TAT, TEY, CDP)
In [31]:
#Define Sequential Forward Selection (sfs)
sfs = SFS(LinearRegression(),
           k_features=5,
           forward=True,
           floating=True,
           scoring = 'r2',
           cv = 0)
#Use SFS to select the top 5 features 
sfs.fit(X, Y_NOX)

#Create a dataframe for the SFS results 
df_SFS_results = pd.DataFrame(sfs.subsets_).transpose()
df_SFS_results
Out[31]:
feature_idx cv_scores avg_score feature_names
1 (0,) [0.31155773122511365] 0.311558 (AT,)
2 (0, 4) [0.34261205608496104] 0.342612 (AT, GTEP)
3 (0, 2, 4) [0.3691172137915859] 0.369117 (AT, AH, GTEP)
4 (0, 2, 3, 4) [0.3870041880302296] 0.387004 (AT, AH, AFDP, GTEP)
5 (0, 2, 3, 4, 5) [0.3947387026237553] 0.394739 (AT, AH, AFDP, GTEP, TIT)
In [29]:
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

5. Embedded Feature Selection Methods¶

5.1 Random Forest¶

In [64]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
In [70]:
# Normalize feature vector
X1 = StandardScaler().fit_transform(X)
# Split the dataset
X_train, X_test, y_train_CO, y_test_CO = train_test_split(X1, Y_CO, test_size = 0.30)

# selecting the features
sel = SelectFromModel(RandomForestRegressor(n_estimators = 100,max_features=5))
sel.fit(X_train, y_train_CO)
Out[70]:
SelectFromModel(estimator=RandomForestRegressor(max_features=5))
In [75]:
from matplotlib import pyplot
# define the model
model = RandomForestRegressor()
# fit the model
model.fit(X_train, y_train_CO)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar(X.columns, importance)
plt.title("Feature importance using Ridge Model for CO")
pyplot.show()
Feature: 0, Score: 0.04851
Feature: 1, Score: 0.03015
Feature: 2, Score: 0.04312
Feature: 3, Score: 0.05387
Feature: 4, Score: 0.03842
Feature: 5, Score: 0.49951
Feature: 6, Score: 0.22018
Feature: 7, Score: 0.04007
Feature: 8, Score: 0.02617
In [76]:
# Try different numbers of n_estimators 
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train_CO)
    scores.append(model.score(X_test, y_test_CO))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)
Out[76]:
[<matplotlib.lines.Line2D at 0x1c9b2492b50>]
In [78]:
y_pred_CO = model.predict(X_test)
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_CO, y_pred_CO))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_CO, y_pred_CO))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_CO, y_pred_CO)))
Mean Absolute Error: 0.48083399434592605
Mean Squared Error: 1.0347359233139202
Root Mean Squared Error: 1.017219702578514
In [73]:
from matplotlib import pyplot
# define the model
model = RandomForestRegressor()
# fit the model
model.fit(X_train, y_train_NOX)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar(X.columns, importance)
plt.title("Feature importance using Ridge Model for NOX")
pyplot.show()
Feature: 0, Score: 0.12818
Feature: 1, Score: 0.11821
Feature: 2, Score: 0.14085
Feature: 3, Score: 0.12954
Feature: 4, Score: 0.10377
Feature: 5, Score: 0.07823
Feature: 6, Score: 0.11035
Feature: 7, Score: 0.09496
Feature: 8, Score: 0.09589
In [79]:
y_pred_NOX = model.predict(X_test)
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_NOX, y_pred_NOX))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_NOX, y_pred_NOX))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_NOX, y_pred_NOX)))
Mean Absolute Error: 62.78462970727042
Mean Squared Error: 4081.8323665636367
Root Mean Squared Error: 63.889219486261034
In [80]:
# Try different numbers of n_estimators 
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train_NOX)
    scores.append(model.score(X_test, y_test_NOX))
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)
Out[80]:
[<matplotlib.lines.Line2D at 0x1c9a724a040>]

5.2 Lasso Regression¶

In [52]:
from sklearn.linear_model import LassoCV
lcv = LassoCV()
# Split the dataset
X_train, X_test, y_train_CO, y_test_CO = train_test_split(X1, Y_CO, test_size = 0.30, random_state = 0)

lcv.fit(X_train, y_train_CO)

print("Best alpha using built-in LassoCV: %f" % lcv.alpha_)
print("Best score using built-in LassoCV: %f" %lcv.score(X_train,y_train_CO))
coef = pd.Series(lcv.coef_, index = X.columns)

print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
Best alpha using built-in LassoCV: 0.001593
Best score using built-in LassoCV: 0.559507
Lasso picked 9 variables and eliminated the other 0 variables
In [53]:
from sklearn.linear_model import Lasso
l1 = Lasso(alpha = 0.001593).fit(X_train, y_train_CO)
#print("Best score using built-in LassoCV: %f" %l1.score(X_train,y_train_CO))
coef = pd.Series(l1.coef_, index = X.columns)
coef
Out[53]:
AT     -0.255437
AP      0.000267
AH     -0.113708
AFDP   -0.101240
GTEP    0.381959
TIT    -1.343750
TAT    -0.577548
TEY    -2.249451
CDP     1.470458
dtype: float64
In [54]:
# Lasso Regression with different alpha values
import seaborn as sns
import matplotlib.pyplot as plt
cf = pd.DataFrame()
cf['features'] = X.columns

fig = plt.figure(figsize = (27,30))

i = 0

for a in [0.0001,0.001,0.01,0.5,1]:
    l2 = Lasso(alpha = a).fit(X_train, y_train_CO)
    l2_predict = l2.predict(X_test)
    
    cf['Lasso Regression'] = np.absolute(l2.coef_.ravel())
    sub = fig.add_subplot(2,3,i+1)
    plt.subplots_adjust(hspace = 0.3)
    sub.set_title('Lasso - ' + str(a), fontsize = 15)
    sns.barplot(x = "features", y = "Lasso Regression", data = cf, palette = 'RdYlBu_r')
    i = i+1
    
In [55]:
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model for CO")
Out[55]:
Text(0.5, 1.0, 'Feature importance using Lasso Model for CO')
In [56]:
import sklearn.metrics as metrics
from scipy import stats
# Make Prediction
Y_pred_CO = lcv.predict(X_test)

# Return Results
print("The R-square is %0.2f"% lcv.score(X_test, y_test_CO))
print('Lasso Regression MAE: %0.1f'%metrics.mean_absolute_error(y_test_CO,Y_pred_CO))
print('Lasso Regression MSE:%0.1f'%metrics.mean_squared_error(y_test_CO,Y_pred_CO))
print('Lasso Regression RMSE:%0.1f'%np.sqrt(metrics.mean_squared_error(y_test_CO,Y_pred_CO)))
se_lcv = stats.sem(Y_pred_CO)
The R-square is 0.57
Lasso Regression MAE: 0.9
Lasso Regression MSE:2.2
Lasso Regression RMSE:1.5
In [43]:
# Split the dataset
X_train, X_test, y_train_NOX, y_test_NOX = train_test_split(X1, Y_NOX, test_size = 0.30, random_state = 0)

lcv.fit(X_train, y_train_NOX)

print("Best alpha using built-in LassoCV: %f" % lcv.alpha_)
print("Best score using built-in LassoCV: %f" %lcv.score(X_train,y_train_NOX))
coef = pd.Series(lcv.coef_, index = X.columns)

print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")
Best alpha using built-in LassoCV: 0.006519
Best score using built-in LassoCV: 0.518887
Lasso picked 9 variables and eliminated the other 0 variables
In [44]:
imp_coef = coef.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Lasso Model for NOX")
Out[44]:
Text(0.5, 1.0, 'Feature importance using Lasso Model for NOX')
In [51]:
import sklearn.metrics as metrics
from scipy import stats
# Make Prediction
Y_pred_NOX = lcv.predict(X_test)

# Return Results
print("The R-square is %0.2f"% lcv.score(X_test, y_test_NOX))
print('Lasso Regression MAE: %0.1f'%metrics.mean_absolute_error(y_test_NOX,Y_pred_NOX))
print('Lasso Regression MSE:%0.1f'%metrics.mean_squared_error(y_test_NOX,Y_pred_NOX))
print('Lasso Regression RMSE:%0.1f'%np.sqrt(metrics.mean_squared_error(y_test_NOX,Y_pred_NOX)))
se_lcv = stats.sem(Y_pred_NOX)
The R-square is 0.51
Lasso Regression MAE: 5.8
Lasso Regression MSE:66.0
Lasso Regression RMSE:8.1